
from lxml import etree

# Sample HTML document parsed by the code below.
# Structure: a <head> holding a <title>, and a <body> holding a <ul> with
# five <li> items (ids item1..item5) followed by one multi-line <p>.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>aTitle发发发1</title>
</head>
<body>
    <ul>
        <li id="item1">醒醒啦1</li>
        <li id="item2">醒醒啦2</li>
        <li id="item3">醒醒啦3</li>
        <li id="item4">醒醒啦4</li>
        <li id="item5">醒醒啦5</li>
    </ul>

    <p>
        hello
        i
        am
        iron
        man
    </p>
</body>
</html>
"""

# Parse the markup; the result is used as the root <html> element below.
tree = etree.HTML(html)
# Show the root's tag name, its leading text and its `lang` attribute.
print(f"{tree.tag} {tree.text} {tree.attrib['lang']}")


# Document title: reach <title> through <head> with one path lookup.
title = tree.find("head/title")
print(f"{title.tag} {title.text}")

# List items: collect every <li> under body/ul, then print id and text.
lis = tree.findall("body/ul/li")
for li in lis:
    print(f"{li.attrib['id']} {li.text}")

# Paragraph: fetch the <p> under <body> and print its text as-is
# (the multi-line content is not stripped or normalized).
p = tree.find("body/p")
print(p.text)
